Scratch Pad for 2024 Cyclistic Data Analysis

Exploratory Data Analysis, unencumbered by the need to be too presentable.

Load data

##             used   (Mb) gc trigger (Mb)  max used   (Mb)
## Ncells   7102638  379.4   12544510  670   7866112  420.1
## Vcells 152792958 1165.8  283901216 2166 277032025 2113.6

Drop duplicate rows

df %>%
  count(ride_id) %>%
  filter(n > 1)
## # A tibble: 211 × 2
##    ride_id              n
##    <chr>            <int>
##  1 011C8EF97AB0F30D     2
##  2 01406457A85B0AFF     2
##  3 02606FBC7F8537EE     2
##  4 0354FD0756337B59     2
##  5 048C715F1DE0D8C0     2
##  6 05D27072A33A290C     2
##  7 0625A51D397A68F9     2
##  8 076F662AFB9CAEC1     2
##  9 07DBFDA3C91006AE     2
## 10 0AE60485F97A473D     2
## # ℹ 201 more rows
df <- df %>% distinct(ride_id, .keep_all = TRUE)
df %>%
  count(ride_id) %>%
  filter(n > 1)
## # A tibble: 0 × 2
## # ℹ 2 variables: ride_id <chr>, n <int>
table(df$month)
## 
##     01     02     03     04     05     06     07     08     09     10     11 
## 144853 223159 301662 414967 609417 710747 749004 755804 820832 616292 334999 
##     12      9 
## 178359     35

Reassigning the month “9” to “09”.

df$month <- sprintf("%02d", as.numeric(format(df$date, "%m")))
table(df$month)
## 
##     01     02     03     04     05     06     07     08     09     10     11 
## 144853 223159 301662 414967 609417 710747 749004 755804 820867 616292 334999 
##     12 
## 178359

Number of member vs casual rides

table(df$member_casual)
## 
##  casual  member 
## 2151431 3708699

3.7 million member rides vs 2.2 million casual rides. Unfortunately, there is no way to tell how many are repeat users. Presumably, many member riders repeat far more often than casual riders. The scenario puts members at 30% of Cyclistic.

In the scenario, and in real life, the data is shared without any sort of identifying information that would help track that, probably why after 2019 they removed gender and birth year. It wasn’t much help anyway.

NA stations and e-bikes

# NA value counts
na_counts <- sapply(df, function(x) sum(is.na(x)))

# print
print(na_counts)
##            ride_id      rideable_type         started_at           ended_at 
##                  0                  0                  0                  0 
## start_station_name   start_station_id   end_station_name     end_station_id 
##            1073800            1073800            1104419            1104419 
##          start_lat          start_lng            end_lat            end_lng 
##                  0                  0               7192               7192 
##      member_casual               date              month                day 
##                  0                  0                  0                  0 
##               year        day_of_week             season               hour 
##                  0                  0                  0                  0 
##        ride_length 
##                  0

Start station counts by rideable_type

df %>%
  mutate(missing_name = is.na(start_station_name)) %>%
  group_by(rideable_type) %>%
  summarize(missing_count = sum(missing_name), total = n(), perc_missing = mean(missing_name)) %>%
  arrange(desc(perc_missing))
## # A tibble: 3 × 4
##   rideable_type    missing_count   total perc_missing
##   <chr>                    <int>   <int>        <dbl>
## 1 electric_scooter         67649  144337        0.469
## 2 electric_bike          1006151 2980299        0.338
## 3 classic_bike                 0 2735494        0

Yup. Missing stations are electric bikes without a docking station, left randomly throughout the city.

End stations counts by rideable_type

df %>%
  mutate(missing_name = is.na(end_station_name)) %>%
  group_by(rideable_type) %>%
  summarize(missing_count = sum(missing_name), total = n(), perc_missing = mean(missing_name)) %>%
  arrange(desc(perc_missing))
## # A tibble: 3 × 4
##   rideable_type    missing_count   total perc_missing
##   <chr>                    <int>   <int>        <dbl>
## 1 electric_scooter         70340  144337      0.487  
## 2 electric_bike          1026633 2980299      0.344  
## 3 classic_bike              7446 2735494      0.00272
known_stations <- df %>%
  filter(!is.na(start_station_id) & !is.na(start_station_name)) %>%
  select(start_station_id, start_station_name, start_lat, start_lng) %>%
  distinct()
print(count(known_stations))
## # A tibble: 1 × 1
##        n
##    <int>
## 1 977569
# attempt to consolidate known stations
known_stations <- df %>%
  filter(!is.na(start_station_id) & !is.na(start_station_name)) %>%
  mutate(
    start_lat = round(start_lat, 4), # roughly the length of a bus
    start_lng = round(start_lng, 4)
  ) %>%
  select(start_station_id, start_station_name, start_lat, start_lng) %>%
  distinct()
print(count(known_stations))
## # A tibble: 1 × 1
##       n
##   <int>
## 1 40269

Stations no longer have a specific lat/lng. No exact station location maps anymore. Have to do heatmaps.

# distinct start_station_name count
distinct_start_station_count <- df %>%
  summarise(distinct_count = n_distinct(start_station_name))

# print
print(distinct_start_station_count)
## # A tibble: 1 × 1
##   distinct_count
##            <int>
## 1           1809

It’s supposed to be 692, per the scenario. (I found 608 in 2019).

# distinct start_station_id count
distinct_start_station_count <- df %>%
  summarise(distinct_count = n_distinct(start_station_id))

# print
print(distinct_start_station_count)
## # A tibble: 1 × 1
##   distinct_count
##            <int>
## 1           1764

So, 45 duplicate station names, probably easily explained. Easier than the extra ~1000 station id’s.

Electronic bikes are a game changer. I thought switching from 2019 to 2024 was gonna be easy.

Ride length (and NA end_lat/lng)

IQR

summary(df$ride_length)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##    0.000    5.545    9.717   17.304   17.250 1559.933

The mean is higher than the 3rd quartile. Data set skews left. Makes sense, as many commuter rides are quick, but leisure rides can last a while. And sometimes people just dump the bike in a ditch so it appears to be a long ride.

max(df$ride_length)/60/24
## [1] 1.083287

A little over a day. Somebody forgot to dock a bike.

print(df[which.max(df$ride_length), ])
## # A tibble: 1 × 21
##   ride_id          rideable_type started_at          ended_at           
##   <chr>            <chr>         <dttm>              <dttm>             
## 1 2E7A26B0B6513B10 classic_bike  2024-03-09 07:48:01 2024-03-10 09:47:57
## # ℹ 17 more variables: start_station_name <chr>, start_station_id <chr>,
## #   end_station_name <chr>, end_station_id <chr>, start_lat <dbl>,
## #   start_lng <dbl>, end_lat <dbl>, end_lng <dbl>, member_casual <chr>,
## #   date <date>, month <chr>, day <chr>, year <dbl>, day_of_week <chr>,
## #   season <chr>, hour <chr>, ride_length <dbl>

Average of rides with no end lat(/lng also missing)

# average ride_length for rows with missing end_lat
avg_ride_length_missing_end_lat <- df %>%
  filter(is.na(end_lat)) %>%
  summarise(avg_ride_length = mean(ride_length, na.rm = TRUE))

# print
print(avg_ride_length_missing_end_lat)/60/24
## # A tibble: 1 × 1
##   avg_ride_length
##             <dbl>
## 1           1500.
##   avg_ride_length
## 1         1.04154

A little over a day.

Min of rides with no end lat(/lng also missing)

# minimum ride_length for rows with missing end_lat
min_ride_length_missing_end_lat <- df %>%
  filter(is.na(end_lat)) %>%
  summarise(min_ride_length = min(ride_length, na.rm = TRUE))

# print
print(min_ride_length_missing_end_lat)/60/24
## # A tibble: 1 × 1
##   min_ride_length
##             <dbl>
## 1           1440.
##   min_ride_length
## 1       0.9997483
## histogram of ride_length
ggplot(df, aes(x = ride_length)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black", alpha = 0.7) +
  scale_x_continuous(limits = c(0, quantile(df$ride_length, 0.99))) + # trim extreme outliers
  labs(title = "Distribution of Ride Length",
       subtitle = "Removed the 99th percentile outliers - the 1%!",
       x = "Ride Length (minutes)",
       y = "Count") +
  theme_minimal()
## Warning: Removed 58602 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Ride length outliers / IQR

Note that I can toggle between these and “run all” to see the impact… and the power to skew results. Mwahahahaha!

# IQR * 1.5 for outliers
# calculate Q1, Q3, and IQR
Q1 <- quantile(df$ride_length, 0.25, na.rm = TRUE)
Q3 <- quantile(df$ride_length, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1

# lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

# filter out outliers
df_no_outliers <- df %>%
  filter(ride_length >= lower_bound & ride_length <= upper_bound)

# print
summary(df_no_outliers$ride_length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.270   8.982  10.947  14.883  34.807
# IQR * 3 for outliers
lower_bound_strict <- Q1 - 3 * IQR
upper_bound_strict <- Q3 + 3 * IQR

# filter out outliers
df_no_outliers <- df %>%
  filter(ride_length >= lower_bound_strict & ride_length <= upper_bound_strict)

# print
summary(df_no_outliers$ride_length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.421   9.382  12.278  16.117  52.363
# remove the 99th percentile
upper_1_percent <- quantile(df$ride_length, 0.99, na.rm = TRUE)

# filter out the top 1% of the data
df_no_outliers <- df %>%
  filter(ride_length <= upper_1_percent)

# print
summary(df_no_outliers$ride_length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.508   9.616  13.644  16.900 100.890
# the 99.5 percentile
upper_.5_percent <- quantile(df$ride_length, 0.995, na.rm = TRUE)

# filter out the top 0.5% of the data
df_no_outliers <- df %>%
  filter(ride_length <= upper_.5_percent)

# print
summary(df_no_outliers$ride_length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.527   9.667  14.178  17.074 149.121
# the 99.75 percentile
upper_.25_percent <- quantile(df$ride_length, 0.9975, na.rm = TRUE)

# filter out the top 0.025% of the data
df_no_outliers <- df %>%
  filter(ride_length <= upper_.25_percent)

# print
summary(df_no_outliers$ride_length)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.533   9.688  14.613  17.164 279.700

I will very conservatively drop the 0.25% of rides over four and a half hours long.

Visualizations

Rides by Member type (total rides, duration)

# sum total ride length by membership type
total_minutes_ridden <- df %>%
  group_by(member_casual) %>%
  summarise(total_minutes = sum(ride_length, na.rm = TRUE))

# bar chart of total minutes ridden
ggplot(total_minutes_ridden, aes(x = member_casual, y = total_minutes, fill = member_casual)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(total_minutes)), vjust = -0.5) +
  theme_minimal() +
  labs(title = "Total Minutes Ridden by Membership Type", y = "Total Minutes Ridden", x = "Rider Type")

# sum total ride length by membership type
total_minutes_ridden <- df_no_outliers %>%
  group_by(member_casual) %>%
  summarise(total_minutes = sum(ride_length, na.rm = TRUE))

# bar chart of total minutes ridden
ggplot(total_minutes_ridden, aes(x = member_casual, y = total_minutes, fill = member_casual)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(total_minutes)), vjust = -0.5) +
  theme_minimal() +
  labs(title = "Total Minutes Ridden by Membership Type, No Outliers", y = "Total Minutes Ridden", x = "Rider Type")

# sum average ride length by membership type
average_ride_length <- df %>%
  group_by(member_casual) %>%
  summarise(average_minutes = mean(ride_length, na.rm = TRUE))

# bar chart of average ride length
ggplot(average_ride_length, aes(x = member_casual, y = average_minutes, fill = member_casual)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(average_minutes, 1)), vjust = -0.5) +
  theme_minimal() +
  labs(title = "Average Ride Length by Membership Type", y = "Average Ride Length (minutes)", x = "Rider Type")

# sum average ride length by membership type
average_ride_length <- df_no_outliers %>%
  group_by(member_casual) %>%
  summarise(average_minutes = mean(ride_length, na.rm = TRUE))

# bar chart of average ride length
ggplot(average_ride_length, aes(x = member_casual, y = average_minutes, fill = member_casual)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(average_minutes, 1)), vjust = -0.5) +
  theme_minimal() +
  labs(title = "Average Ride Length by Membership Type, No Outliers", y = "Average Ride Length (minutes)", x = "Rider Type")

# bar chart 0f ride frequency per user
ggplot(df, aes(x = member_casual, fill = member_casual)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Total Rides by Membership Type",
       x = "Rider Type",
       y = "Total Rides") +
  theme_minimal()
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# bar chart 0f ride frequency per user
ggplot(df_no_outliers, aes(x = member_casual, fill = member_casual)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  labs(title = "Total Rides by Membership Type, No Outliers",
       x = "Rider Type",
       y = "Total Rides") +
  theme_minimal()

# histograms of ride length per user type
ggplot(df, aes(x = ride_length, fill = member_casual)) +
  geom_histogram(binwidth = 5, alpha = 0.7) +
  facet_wrap(~ member_casual, scales = "free_y") +
  labs(title = "Histogram of Ride Lengths by Membership Type",
       x = "Ride Length (minutes)",
       y = "Frequency") +
  theme_minimal() +
  # scale_fill_manual(values = c("member" = "blue", "casual" = "red")) +
  xlim(0,100)
## Warning: Removed 59626 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Members ride more frequently for shorter duration (probably commuting).

Casuals ride longer (probably leisure).

Time and Ridership

# line chart of rides per month
df %>%
  group_by(month, member_casual) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = month, y = rides, color = member_casual, group = member_casual)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(title = "Rides per Month, by Customer Type", x = "Month", y = "Number of Rides")
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

Peak in summer, unsurprisingly.

# bar chart of rides per month
df %>%
  group_by(month, member_casual) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = month, y = rides, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = rides), vjust = -0.5, position = position_dodge(0.9)) +
  theme_minimal() +
  labs(title = "Total Rides per Month by Customer Type", x = "Month", y = "Number of Rides")
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

# bar chart of ride length per month
df %>%
  group_by(month, member_casual) %>%
  summarise(avg_ride_length = mean(ride_length, na.rm = TRUE)) %>%
  ggplot(aes(x = month, y = avg_ride_length, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = round(avg_ride_length, 1)), vjust = -0.5, position = position_dodge(0.9)) +
  theme_minimal() +
  labs(title = "Average Trip Duration per Month by Customer Type", x = "Month", y = "Average Trip Duration (minutes)")
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

Here is where I noticed a “9” month. There were 35 rides in the month “9” instead of “09”. Stuck out at 100 and 450 minute long trip durations (outliers). I fixed it up at the top.

# rides per season
df %>%
  mutate(season = factor(season, levels = c("Winter", "Spring", "Summer", "Fall"))) %>%
  group_by(season, member_casual) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = season, y = rides, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = rides), vjust = -0.5, position = position_dodge(0.9)) +
  theme_minimal() +
  labs(title = "Total Rides per Season by Customer Type", x = "Season", y = "Number of Rides") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)))
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.

# ride length per season
df %>%
  mutate(season = factor(season, levels = c("Winter", "Spring", "Summer", "Fall"))) %>%
  group_by(season, member_casual) %>%
  summarise(avg_ride_length = mean(ride_length, na.rm = TRUE)) %>%
  ggplot(aes(x = season, y = avg_ride_length, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = round(avg_ride_length, 1)), vjust = -0.5, position = position_dodge(0.9)) +
  theme_minimal() +
  labs(title = "Average Trip Duration per Season by Customer Type", x = "Season", y = "Average Trip Duration (minutes)") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)))
## `summarise()` has grouped output by 'season'. You can override using the
## `.groups` argument.

# line chart of rides per hour
df %>%
  group_by(hour, member_casual) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = hour, y = rides, color = member_casual, group = member_casual)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(title = "Rides per Hour, by Customer Type", x = "Hour", y = "Number of Rides")
## `summarise()` has grouped output by 'hour'. You can override using the
## `.groups` argument.

# bar chart of rides per hour
df %>%
  group_by(hour, member_casual) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = hour, y = rides, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme_minimal() +
  labs(title = "Rides per Hour of Day by Customer Type", x = "Hour of Day", y = "Number of Rides")
## `summarise()` has grouped output by 'hour'. You can override using the
## `.groups` argument.

# bar chart of rides per hour by day of week, free y-axis
df %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  group_by(hour, member_casual, day_of_week) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = hour, y = rides, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ day_of_week, scales = "free_y") +
  theme_minimal() +
  labs(title = "Rides per Hour of Day by Customer Type and Day of Week", subtitle = "Free y-axis", x = "Hour of Day", y = "Number of Rides")
## `summarise()` has grouped output by 'hour', 'member_casual'. You can override
## using the `.groups` argument.

# bar chart of rides per hour by day of week, locked y-axis
df %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  group_by(hour, member_casual, day_of_week) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = hour, y = rides, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ day_of_week) +
  theme_minimal() +
  labs(title = "Rides per Hour of Day by Customer Type and Day of Week", subtitle = "Locked y-axis", x = "Hour of Day", y = "Number of Rides")
## `summarise()` has grouped output by 'hour', 'member_casual'. You can override
## using the `.groups` argument.

# bar chart of rides / member_casual / day of week
df %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  group_by(member_casual, day_of_week) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = member_casual, y = rides, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = rides), vjust = -0.5, position = position_dodge(0.9)) +
  facet_wrap(~ day_of_week) +
  theme_minimal() +
  labs(title = "Rides per Member Type by Day of Week", x = "Member Type", y = "Number of Rides") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# bar chart of rides / member_casual / day of week
df %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  group_by(member_casual, day_of_week) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = day_of_week, y = rides, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = rides), vjust = -0.5, position = position_dodge(0.9)) +
  theme_minimal() +
  labs(title = "Rides per Member Type by Day of Week", x = "Member Type", y = "Number of Rides") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# bar chart avg ride time / member_casual / day of week
df %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  group_by(member_casual, day_of_week) %>%
  summarise(avg_ride_length = mean(ride_length, na.rm = TRUE)) %>%
  ggplot(aes(x = member_casual, y = avg_ride_length, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = round(avg_ride_length, 1)), vjust = -0.5, position = position_dodge(0.9)) +
  facet_wrap(~ day_of_week) +
  theme_minimal() +
  labs(title = "Average Ride Length per Member Type by Day of Week", x = "Member Type", y = "Average Ride Length (minutes)") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# bar chart avg ride time / member_casual / day of week
df_no_outliers %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  group_by(member_casual, day_of_week) %>%
  summarise(avg_ride_length = mean(ride_length, na.rm = TRUE)) %>%
  ggplot(aes(x = member_casual, y = avg_ride_length, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = round(avg_ride_length, 1)), vjust = -0.5, position = position_dodge(0.9)) +
  facet_wrap(~ day_of_week) +
  theme_minimal() +
  labs(title = "Average Ride Length per Member Type by Day of Week, No Outliers", x = "Member Type", y = "Average Ride Length (minutes)") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# bar chart avg ride time / member_casual / day of week
df %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  group_by(member_casual, day_of_week) %>%
  summarise(avg_ride_length = mean(ride_length, na.rm = TRUE)) %>%
  ggplot(aes(x = day_of_week, y = avg_ride_length, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = round(avg_ride_length, 1)), vjust = -0.5, position = position_dodge(0.9)) +
  theme_minimal() +
  labs(title = "Average Ride Length per Member Type by Day of Week", x = "Member Type", y = "Average Ride Length (minutes)") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# bar chart avg ride time / member_casual / day of week
df_no_outliers %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  group_by(member_casual, day_of_week) %>%
  summarise(avg_ride_length = mean(ride_length, na.rm = TRUE)) %>%
  ggplot(aes(x = day_of_week, y = avg_ride_length, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_text(aes(label = round(avg_ride_length, 1)), vjust = -0.5, position = position_dodge(0.9)) +
  theme_minimal() +
  labs(title = "Average Ride Length per Member Type by Day of Week, No Outliers", x = "Member Type", y = "Average Ride Length (minutes)") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.2)))
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

# aggregate member rider counts by hour and day of the week
member_peak_times <- df %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%  filter(member_casual == "member") %>%
  group_by(day_of_week, hour) %>%
  summarise(rides = n(), .groups = "drop")

# heatmap of peak times for member riders
ggplot(member_peak_times, aes(x = hour, y = day_of_week, fill = rides)) +
  geom_tile() +
  scale_fill_viridis_c() +
  theme_minimal() +
  labs(title = "Peak Usage Times for Member Riders", x = "Hour of Day", y = "Day of Week")

Members peak with commuter hours.

# aggregate casual rider counts by hour and day of the week
casual_peak_times <- df %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%  filter(member_casual == "casual") %>%
  group_by(day_of_week, hour) %>%
  summarise(rides = n(), .groups = "drop")

# heatmap of peak times for casual riders
ggplot(casual_peak_times, aes(x = hour, y = day_of_week, fill = rides)) +
  geom_tile() +
  scale_fill_viridis_c() +
  theme_minimal() +
  labs(title = "Peak Usage Times for Casual Riders", x = "Hour of Day", y = "Day of Week")

Peak time is Saturday afternoon, followed by Sunday afternoon. But also a little around peak end-of-work hour. Are casual riders transiting in to work, and biking back?

Bike Type

# bar chart of total count of rideable_type
ggplot(df, aes(x = rideable_type, fill = rideable_type)) +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5) +
  theme_minimal() +
  labs(title = "Total Count of Rideable Types",
       x = "Rideable Type",
       y = "Count")

# bar chart of rideable_type count per member_casual
ggplot(df, aes(x = rideable_type, fill = member_casual)) +
  geom_bar(position = "dodge") +
  geom_text(stat = "count", aes(label = ..count..), vjust = -0.5, position = position_dodge(0.9)) +
  theme_minimal() +
  labs(title = "Count of Rideable Types per Member Type",
       x = "Rideable Type",
       y = "Count")

# boxplot, ride length by rideable type and member type
ggplot(df, aes(x = rideable_type, y = ride_length, fill = member_casual)) +
  geom_boxplot() +
  theme_minimal() +
  labs(title = "Ride Length Distribution by Rideable Type and Member Type",
       x = "Rideable Type",
       y = "Ride Length (minutes)")

Classic pedal bikes are favored. Is that because there’s a bigger supply, or greater consumer preference?

# bar chart: ride frequency by day of the week
df %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  group_by(day_of_week, rideable_type, member_casual) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = day_of_week, y = rides, fill = member_casual)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ rideable_type) +
  theme_minimal() +
  labs(title = "Ride Frequency by Day of the Week",
       x = "Day of the Week",
       y = "Number of Rides")
## `summarise()` has grouped output by 'day_of_week', 'rideable_type'. You can
## override using the `.groups` argument.

# line chart of ride frequency by hour of the day
df %>%
  group_by(hour, rideable_type, member_casual) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = hour, y = rides, color = member_casual, group = member_casual)) +
  geom_line() +
  facet_wrap(~ rideable_type) +
  theme_minimal() +
  labs(title = "Ride Frequency by Hour of the Day",
       x = "Hour of the Day",
       y = "Number of Rides")
## `summarise()` has grouped output by 'hour', 'rideable_type'. You can override
## using the `.groups` argument.

# heatmap of ride frequency / day of the week / hour of the day
df %>%
  mutate(day_of_week = factor(day_of_week, levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))) %>%
  group_by(day_of_week, hour, rideable_type, member_casual) %>%
  summarise(rides = n()) %>%
  ggplot(aes(x = hour, y = day_of_week, fill = rides)) +
  geom_tile() +
  facet_wrap(~ rideable_type + member_casual, ncol = 2) +
  scale_fill_viridis_c() +
  theme_minimal() +
  labs(title = "Heatmap of Ride Frequency by Day of the Week and Hour of the Day",
       x = "Hour of the Day",
       y = "Day of the Week")
## `summarise()` has grouped output by 'day_of_week', 'hour', 'rideable_type'. You
## can override using the `.groups` argument.

df %>%
  mutate(month = format(as.Date(started_at), "%Y-%m")) %>%
  group_by(month, rideable_type, member_casual) %>%
  summarise(rides = n(), .groups = "drop") %>%
  ggplot(aes(x = month, y = rides, fill = rideable_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ member_casual) +
  theme_minimal() +
  labs(title = "Rideable Type Usage per Month",
       x = "Month",
       y = "Number of Rides") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

what’s going on with electric_scooter there? Did they only briefly demo it in September? Or is this a categorizing error?

Stations

# bar chart: top start stations
df %>%
  count(start_station_name, member_casual, sort = TRUE) %>%
  group_by(member_casual) %>%
  top_n(20, n) %>%
  ggplot(aes(x = reorder(start_station_name, n), y = n, fill = member_casual)) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top Start Stations by Rider Type", x = "Station Name", y = "Number of Rides")

# bar chart: top start stations
df %>%
  filter(!is.na(start_station_name)) %>% 
  count(start_station_name, member_casual, sort = TRUE) %>%
  group_by(member_casual) %>%
  top_n(20, n) %>%
  ggplot(aes(x = reorder(start_station_name, n), y = n, fill = member_casual)) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top Start Stations by Rider Type", x = "Station Name", y = "Number of Rides")

“Milleniumn Park”, “Shedd Aquarium”, “Theater on the Lake”, “Dusable Lake Shore / Harbor”, and “Adler Planetarium” sound like destinations for the occasional bike rider.

# bar chart: top end stations
df %>%
  filter(!is.na(end_station_name)) %>% 
  count(end_station_name, member_casual, sort = TRUE) %>%
  group_by(member_casual) %>%
  top_n(20, n) %>%
  ggplot(aes(x = reorder(end_station_name, n), y = n, fill = member_casual)) +
  geom_col() +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top End Stations by Rider Type", x = "Station Name", y = "Number of Rides")

# most popular start stations
top_stations <- df %>%
  filter(!is.na(start_station_name)) %>% 
  count(start_station_name, member_casual, sort = TRUE) %>%
  group_by(member_casual) %>%
  top_n(20, n)

print(top_stations)
## # A tibble: 40 × 3
## # Groups:   member_casual [2]
##    start_station_name                 member_casual     n
##    <chr>                              <chr>         <int>
##  1 Streeter Dr & Grand Ave            casual        51049
##  2 DuSable Lake Shore Dr & Monroe St  casual        34102
##  3 Kingsbury St & Kinzie St           member        29522
##  4 Clinton St & Washington Blvd       member        27745
##  5 Michigan Ave & Oak St              casual        25137
##  6 Clinton St & Madison St            member        24893
##  7 Clark St & Elm St                  member        24714
##  8 DuSable Lake Shore Dr & North Blvd casual        23099
##  9 Millennium Park                    casual        22646
## 10 Shedd Aquarium                     casual        21078
## # ℹ 30 more rows
# most popular end stations
top_stations <- df %>%
  filter(!is.na(end_station_name)) %>% 
  count(end_station_name, member_casual, sort = TRUE) %>%
  group_by(member_casual) %>%
  top_n(20, n)

print(top_stations)
## # A tibble: 40 × 3
## # Groups:   member_casual [2]
##    end_station_name                   member_casual     n
##    <chr>                              <chr>         <int>
##  1 Streeter Dr & Grand Ave            casual        54733
##  2 DuSable Lake Shore Dr & Monroe St  casual        31916
##  3 Kingsbury St & Kinzie St           member        29700
##  4 Clinton St & Washington Blvd       member        28449
##  5 DuSable Lake Shore Dr & North Blvd casual        26590
##  6 Clinton St & Madison St            member        26000
##  7 Michigan Ave & Oak St              casual        25784
##  8 Clark St & Elm St                  member        24728
##  9 Millennium Park                    casual        24258
## 10 Wells St & Concord Ln              member        20821
## # ℹ 30 more rows
# Leaflet: popular start stations with cleaned data
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.4.2
# remove rows with missing latitude or longitude values
cleaned_data <- df %>%
  filter(!is.na(start_lng) & !is.na(start_lat))

# get a sample - too many rows, it crashes my computer to plot all 4 million
set.seed(123)  # for reproducibility of sample
cleaned_data <- cleaned_data %>% sample_n(1000)

# plot sampled stations
leaflet(data = cleaned_data) %>%
  addTiles() %>%
  addMarkers(lng = ~start_lng, lat = ~start_lat, clusterOptions = markerClusterOptions())

Remember, there are 1800 stations (per the data), not 692 (per the company - in scenario), for mysterious reasons.

# Leaflet: popular end stations with cleaned data
library(leaflet)

# remove rows with missing latitude or longitude values
cleaned_data <- df %>%
  filter(!is.na(end_lng) & !is.na(end_lat))

# get a sample - too many rows, it crashes my computer to plot all 4 million
set.seed(123)  # for reproducibility of sample
cleaned_data <- cleaned_data %>% sample_n(1000)

# plot sampled stations
leaflet(data = cleaned_data) %>%
  addTiles() %>%
  addMarkers(lng = ~end_lng, lat = ~end_lat, clusterOptions = markerClusterOptions())
library(sf)
library(ggspatial)

# sample data
set.seed(123)  # for reproducibility
cleaned_data <- df %>% sample_n(1000)

# create linestring geometries for routes
routes <- cleaned_data %>%
  drop_na(start_lng, start_lat, end_lng, end_lat) %>%
  rowwise() %>%
  mutate(geometry = st_sfc(st_linestring(matrix(c(start_lng, start_lat, end_lng, end_lat), ncol = 2, byrow = TRUE)), crs = 4326)) %>%
  st_as_sf()

# plot routes
ggplot() +
  annotation_map_tile(type = "osm", zoom = 12) +
  geom_sf(data = routes, aes(geometry = geometry), color = "blue", size = 0.5, alpha = 0.5) +
  labs(title = "Sample Routes",
       x = "Longitude",
       y = "Latitude") +
  theme_minimal()
## Zoom: 12

# aggregate casual rider counts by station
casual_peak_stations <- df %>%
  filter(!is.na(start_station_name)) %>% 
  filter(member_casual == "casual") %>%
  count(start_station_name, sort = TRUE) %>%
  top_n(10, n)

# bar chart of top stations for casual riders
ggplot(casual_peak_stations, aes(x = reorder(start_station_name, n), y = n, fill = start_station_name)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top Start Stations for Casual Riders", x = "Station Name", y = "Number of Rides")

# map of top start stations for casual riders
library(sf)
library(ggspatial)

# convert top casual start stations to sf object
casual_start_sf <- casual_peak_stations %>%
  left_join(df, by = c("start_station_name")) %>% # Join to get lat/lng
  distinct(start_station_name, start_lat, start_lng) %>% # Remove duplicates
  drop_na(start_lat, start_lng) %>% 
  st_as_sf(coords = c("start_lng", "start_lat"), crs = 4326)

# plot the map
ggplot() +
  annotation_map_tile(type = "osm", zoom = 12) +
  geom_sf(data = casual_start_sf, color = "blue", size = 3, alpha = 0.7) +
  labs(title = "Top Start Stations for Casual Riders (Map)",
       x = "Longitude", y = "Latitude") +
  theme_minimal()
## Zoom: 12

# extract end stations for trips originating at top casual start stations
casual_end_from_start_sf <- df %>%
  filter(!is.na(end_station_name)) %>% 
  filter(start_station_name %in% casual_peak_stations$start_station_name) %>%
  distinct(start_station_name, end_station_name, end_lat, end_lng) %>%
  drop_na(end_lat, end_lng) %>%
  st_as_sf(coords = c("end_lng", "end_lat"), crs = 4326)

# map end stations
ggplot() +
  annotation_map_tile(type = "osm", zoom = 12) +
  geom_sf(data = casual_end_from_start_sf, color = "blue", size = 3, alpha = 0.7) +
  labs(title = "End Stations for Trips from Top Start Stations (Casual Riders)",
       x = "Longitude", y = "Latitude") +
  theme_minimal()
## Zoom: 12

# aggregate casual rider counts by station
casual_peak_stations <- df %>%
  filter(!is.na(end_station_name)) %>% 
  filter(member_casual == "casual") %>%
  count(end_station_name, sort = TRUE) %>%
  top_n(10, n)

# bar chart of top stations for casual riders
ggplot(casual_peak_stations, aes(x = reorder(end_station_name, n), y = n, fill = end_station_name)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top End Stations for Casual Riders", x = "Station Name", y = "Number of Rides")

# Convert top casual end stations to sf object
casual_end_sf <- casual_peak_stations %>%
  filter(!is.na(end_station_name)) %>% 
  left_join(df, by = c("end_station_name")) %>% 
  distinct(end_station_name, end_lat, end_lng) %>% 
  drop_na(end_lat, end_lng) %>% 
  st_as_sf(coords = c("end_lng", "end_lat"), crs = 4326)

# Plot map
ggplot() +
  annotation_map_tile(type = "osm", zoom = 12) +
  geom_sf(data = casual_end_sf, color = "blue", size = 3, alpha = 0.7) +
  labs(title = "Top End Stations for Casual Riders (Map)",
       x = "Longitude", y = "Latitude") +
  theme_minimal()
## Zoom: 12

Roughly looks like 2019.

# extract start stations for trips ending at top casual end stations
casual_start_from_end_sf <- df %>%
  filter(!is.na(start_station_name)) %>% 
  filter(end_station_name %in% casual_peak_stations$end_station_name) %>%
  distinct(end_station_name, start_station_name, start_lat, start_lng) %>%
  drop_na(start_lat, start_lng) %>%
  st_as_sf(coords = c("start_lng", "start_lat"), crs = 4326)

# map: start stations
ggplot() +
  annotation_map_tile(type = "osm", zoom = 12) +
  geom_sf(data = casual_start_from_end_sf, color = "blue", size = 3, alpha = 0.7) +
  labs(title = "Start Stations for Trips Ending at Top End Stations (Casual Riders)",
       x = "Longitude", y = "Latitude") +
  theme_minimal()
## Zoom: 12

# aggregate member rider counts by station
member_peak_stations <- df %>%
  filter(!is.na(start_station_name)) %>% 
  filter(member_casual == "member") %>%
  count(start_station_name, sort = TRUE) %>%
  top_n(10, n)

# bar chart of top stations for member riders
ggplot(member_peak_stations, aes(x = reorder(start_station_name, n), y = n, fill = start_station_name)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top Start Stations for Member Riders", x = "Station Name", y = "Number of Rides")

# convert top member start stations to sf object
member_start_sf <- member_peak_stations %>%
  left_join(df, by = c("start_station_name")) %>% 
  distinct(start_station_name, start_lat, start_lng) %>% 
  drop_na(start_lat, start_lng) %>% 
  st_as_sf(coords = c("start_lng", "start_lat"), crs = 4326)

# plot map
ggplot() +
  annotation_map_tile(type = "osm", zoom = 12) +
  geom_sf(data = member_start_sf, color = "red", size = 3, alpha = 0.7) +
  labs(title = "Top Start Stations for Member Riders (Map)",
       x = "Longitude", y = "Latitude") +
  theme_minimal()
## Zoom: 12

# extract end stations for trips starting at top member start stations
member_end_sf <- df %>%
  filter(!is.na(end_station_name)) %>% 
  filter(start_station_name %in% member_peak_stations$start_station_name) %>%
  distinct(start_station_name, end_station_name, end_lat, end_lng) %>%
  drop_na(end_lat, end_lng) %>%
  st_as_sf(coords = c("end_lng", "end_lat"), crs = 4326)

# plot map
ggplot() +
  annotation_map_tile(type = "osm", zoom = 12) +
  geom_sf(data = member_end_sf, color = "blue", size = 3, alpha = 0.7) +
  labs(title = "End Stations for Trips from Top Start Stations",
       x = "Longitude", y = "Latitude") +
  theme_minimal()
## Zoom: 12

# aggregate member rider counts by station
member_peak_stations <- df %>%
  filter(!is.na(end_station_name)) %>% 
  filter(member_casual == "member") %>%
  count(end_station_name, sort = TRUE) %>%
  top_n(10, n)

# bar chart of top stations for member riders
ggplot(member_peak_stations, aes(x = reorder(end_station_name, n), y = n, fill = end_station_name)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  theme_minimal() +
  labs(title = "Top End Stations for Member Riders", x = "Station Name", y = "Number of Rides")

# Convert top member end stations to sf object
member_end_sf <- member_peak_stations %>%
  left_join(df, by = c("end_station_name")) %>% 
  distinct(end_station_name, end_lat, end_lng) %>% 
  drop_na(end_lat, end_lng) %>% 
  st_as_sf(coords = c("end_lng", "end_lat"), crs = 4326)

# Plot map
ggplot() +
  annotation_map_tile(type = "osm", zoom = 12) +
  geom_sf(data = member_end_sf, color = "red", size = 3, alpha = 0.7) +
  labs(title = "Top End Stations for Member Riders (Map)",
       x = "Longitude", y = "Latitude") +
  theme_minimal()
## Zoom: 12

Again, looks like 2019. Members go to The Loop, Chicago’s business district.

# Extract start stations for trips ending at top member end stations
member_start_sf <- df %>%
  filter(!is.na(start_station_name)) %>% 
  filter(end_station_name %in% member_peak_stations$end_station_name) %>%
  distinct(end_station_name, start_station_name, start_lat, start_lng) %>%
  drop_na(start_lat, start_lng) %>%
  st_as_sf(coords = c("start_lng", "start_lat"), crs = 4326)

# Plot map
ggplot() +
  annotation_map_tile(type = "osm", zoom = 12) +
  geom_sf(data = member_start_sf, color = "blue", size = 3, alpha = 0.7) +
  labs(title = "Start Stations for Trips Ending at Top End Stations",
       x = "Longitude", y = "Latitude") +
  theme_minimal()
## Zoom: 12

# prep heatmaps

# bounding box for Chicago (approximate lat/lng bounds) to exclude strange outliers
chicago_bbox <- st_bbox(c(xmin = -88.1, xmax = -87.3, ymin = 41.5, ymax = 42.1), crs = st_crs(4326))

# filter to bbox
df_bbox <- df %>%
  filter(end_lng >= chicago_bbox["xmin"], end_lng <= chicago_bbox["xmax"],
         end_lat >= chicago_bbox["ymin"], end_lat <= chicago_bbox["ymax"])

# electric vs. classic heatmaps
electric <- df_bbox %>% filter(rideable_type == "electric_bike")
classic <- df_bbox %>% filter(rideable_type == "classic_bike")

# member vs. casual heatmaps
member <- df_bbox %>% filter(member_casual == "member")
casual <- df_bbox %>% filter(member_casual == "casual")
# heatmap function
plot_heatmap <- function(data, title) {
  ggplot(data, aes(x = start_lng, y = start_lat)) +
    geom_bin2d(bins = 100) + # Adjust bins for granularity
    scale_fill_gradient(low = "blue", high = "red") +
    theme_minimal() +
    labs(title = title, x = "Longitude", y = "Latitude")
}
plot_heatmap(electric, "Electric Bike Trips Heatmap")

plot_heatmap(classic, "Classic Bike Trips Heatmap")

plot_heatmap(member, "Member Trips Heatmap")

plot_heatmap(casual, "Casual Trips Heatmap")

# # contour plot
# plot_contour_heatmap <- function(data, title) {
#   ggplot(data, aes(x = end_lng, y = end_lat)) +
#     annotation_map_tile(type = "osm", zoom = 12) +  # OSM background
#     geom_density2d_filled(alpha = 0.6) +  # Adjust transparency
#     theme_minimal() +
#     labs(title = title, x = "Longitude", y = "Latitude")
# }
library(leaflet)
library(leaflet.extras)
## Warning: package 'leaflet.extras' was built under R version 4.4.3
plot_leaflet_heatmap <- function(data, title, sample_size = 50000) {
  data_sample <- data %>% sample_n(sample_size)

  # Compute density (counts per unique lat/lng combination)
  density_data <- data_sample %>%
    count(end_lng, end_lat)

  # Define a continuous color palette
  color_palette <- colorNumeric(palette = "viridis", domain = density_data$n)

  leaflet(data_sample) %>%
    addTiles() %>%
    setView(lng = -87.63, lat = 41.88, zoom = 13) %>%  # Center on downtown Chicago
    addHeatmap(lng = ~end_lng, lat = ~end_lat, blur = 15, max = 1, radius = 10) %>%
    addLegend(position = "bottomright",
              pal = color_palette,
              values = density_data$n,
              title = title)
}
set.seed(42)  # Global seed for reproducibility

# Sample each group separately
# electric <- df_bbox %>% filter(rideable_type == "electric_bike") %>% sample_n(50000)
# classic <- df_bbox %>% filter(rideable_type == "classic_bike") %>% sample_n(50000)
member <- df_bbox %>% filter(member_casual == "member") %>% sample_n(50000)
casual <- df_bbox %>% filter(member_casual == "casual") %>% sample_n(50000)
# # Plot Contour Heatmaps
# plot_contour_heatmap(electric, "Electric Bike Trips Heatmap")
# plot_contour_heatmap(classic, "Classic Bike Trips Heatmap")
# plot_contour_heatmap(member, "Member Trips Heatmap")
# plot_contour_heatmap(casual, "Casual Trips Heatmap")
# Plot Leaflet Heatmaps
# plot_leaflet_heatmap(electric, "Electric Bike Trips")
# plot_leaflet_heatmap(classic, "Classic Bike Trips")
plot_leaflet_heatmap(member, "Member Bike Trips")
plot_leaflet_heatmap(casual, "Casual Bike Trips")
# library(leaflet)
# library(leaflet.extras)
# library(dplyr)
# 
# plot_leaflet_heatmap <- function(data, lng_col, lat_col, title, zoom_level = 13) {
#   # Remove rows with missing lat/lng values
#   data <- data %>% filter(!is.na(!!sym(lng_col)) & !is.na(!!sym(lat_col)))
#   
#   density_data <- data %>% count(!!sym(lng_col), !!sym(lat_col))
#   color_palette <- colorNumeric(palette = "viridis", domain = density_data$n)
#   
#   leaflet(data) %>%
#     addTiles() %>%
#     setView(lng = -87.63, lat = 41.88, zoom = zoom_level) %>%
#     addHeatmap(lng = as.formula(paste0("~", lng_col)), 
#                lat = as.formula(paste0("~", lat_col)), 
#                blur = 15, max = 1, radius = 10) %>%
#     addLegend(position = "bottomright", 
#               pal = color_palette, 
#               values = density_data$n, 
#               title = title)
# }
# 
# # Filter subsets
# member <- df %>% filter(member_casual == "member")
# casual <- df %>% filter(member_casual == "casual")
# Generate heatmaps at default zoom (downtown focus)
# plot_leaflet_heatmap(member, "start_lng", "start_lat", "Member Start Locations Heatmap", zoom_level = 13)
# plot_leaflet_heatmap(member, "end_lng", "end_lat", "Member End Locations Heatmap", zoom_level = 13)
# plot_leaflet_heatmap(casual, "start_lng", "start_lat", "Casual Start Locations Heatmap", zoom_level = 13)
# plot_leaflet_heatmap(casual, "end_lng", "end_lat", "Casual End Locations Heatmap", zoom_level = 13)
# plot_leaflet_heatmap(member, "start_lng", "start_lat", "Member Start Locations Heatmap (Greater Chicago)", zoom_level = 11)
# plot_leaflet_heatmap(member, "end_lng", "end_lat", "Member End Locations Heatmap (Greater Chicago)", zoom_level = 11)
# plot_leaflet_heatmap(casual, "start_lng", "start_lat", "Casual Start Locations Heatmap (Greater Chicago)", zoom_level = 11)
# plot_leaflet_heatmap(casual, "end_lng", "end_lat", "Casual End Locations Heatmap (Greater Chicago)", zoom_level = 11)